Big Lasso
## Warning: package 'rmdformats' was built under R version 4.0.3
PCA
Principal Component Analysis, or PCA, is a popular technique applicable to different types such as dimensionality reduction, data compression, feature extraction, and visualization.
Basically, it is used to project a dataset from many correlated coordinates onto fewer uncorrelated coordinates called principal components while still retaining most of the variability present in the data.
###########################################
################### PCA ###################
###########################################
library(bigstatsr)
set.seed(1)
X <- big_attachExtdata()
n <- nrow(X)
X[1:2,1:3] [,1] [,2] [,3]
[1,] 2 2 0
[2,] 1 2 1
[1] 517 4542
# We only use only half of the data
ind <- sort(sample(n, n/2))
test <- big_SVD(X, fun.scaling = big_scale(),
ind.row = ind)
str(test)List of 5
$ d : num [1:10] 178.5 114.5 91 87.1 86.3 ...
$ u : num [1:258, 1:10] -0.1092 -0.0928 -0.0806 -0.0796 -0.1028 ...
$ v : num [1:4542, 1:10] 0.00607 0.00739 0.02921 -0.01283 0.01473 ...
$ center: num [1:4542] 1.34 1.63 1.51 1.64 1.09 ...
$ scale : num [1:4542] 0.665 0.551 0.631 0.55 0.708 ...
- attr(*, "class")= chr "big_SVD"
# A more realistic projection based on the scores
scores <- test$u %*% diag(test$d)
plot(scores[,2]~scores[,1])
# using the classical pca from R
pca <- prcomp(X[ind, ], center = TRUE, scale. = TRUE)
# same scaling
all.equal(test$center, pca$center)[1] TRUE
[1] TRUE
# projecting on new data
ind2 <- setdiff(rows_along(X), ind)
scores.test2 <- predict(test, X, ind.row = ind2)
plot(scores[,2]~scores[,1])
points(scores.test2[,2]~scores.test2[,1],col="red")# We start by reading an image and we perform SVDs on this image.
if (!"jpeg" %in% installed.packages())
install.packages("jpeg")
# Read image file into an array with three channels
# (Red-Green-Blue, RGB)
ivy <- jpeg::readJPEG("Ivy.jpg")
r <- ivy[, , 1] ; g <- ivy[, , 2] ; b <- ivy[, , 3]
# Performs full SVD of each channel
ivy.r.svd <- svd(r) ; ivy.g.svd <- svd(g) ;
ivy.b.svd <- svd(b)
rgb.svds <- list(ivy.r.svd, ivy.g.svd, ivy.b.svd)
# These two functions will be needed to display an image stored in an
# RGB array:
# Function to display an image stored in an RGB array
plot.image <- function(pic, main = "") {
h <- dim(pic)[1] ; w <- dim(pic)[2]
plot(x = c(0, h), y = c(0, w), type = "n", xlab = "",
ylab = "", main = main)
rasterImage(pic, 0, 0, h, w)
}
compress.image <- function(rgb.svds, nb.comp) {
# nb.comp (number of components) should be less than min(dim(img[,,1])),
# i.e., 170 here
svd.lower.dim <- lapply(rgb.svds, function(i)
list(d = i$d[1:nb.comp],
u = i$u[, 1:nb.comp],
v = i$v[, 1:nb.comp]))
img <- sapply(svd.lower.dim, function(i) {
img.compressed <- i$u %*% diag(i$d) %*% t(i$v)
}, simplify = 'array')
img[img < 0] <- 0
img[img > 1] <- 1
return(list(img = img, svd.reduced = svd.lower.dim))
}
par(mfrow = c(1, 2))
plot.image(ivy, "Original image")
p <- 20 ; plot.image(compress.image(rgb.svds, p)$img,
paste("SVD with", p, "components"))19691960 bytes
653720 bytes
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("mixOmics", version = "3.8")
library(mixOmics)
data(liver.toxicity)
X <- liver.toxicity$gene
MyResult.pca <- pca(X) # 1 Run the method
plotIndiv(MyResult.pca) # 2 Plot the samples# Customize plots: two factors displayed
plotIndiv(MyResult.pca, ind.names = FALSE,
group = liver.toxicity$treatment$Dose.Group,
pch = as.factor(liver.toxicity$treatment$Time.Group),
legend = TRUE, title = 'Liver toxicity: genes, PCA comp 1 - 2',
legend.title = 'Dose', legend.title.pch = 'Exposure')# second PCA with 3 components:
MyResult.pca2 <- pca(X, ncomp = 3)
plotIndiv(MyResult.pca2, comp = c(1,3), legend = TRUE,
group = liver.toxicity$treatment$Time.Group,
title = 'Multidrug transporter, PCA comp 1 - 3')# Here, the 3rd component on the y-axis clearly highlights a time of
# exposure effect.
# Amount of variance explained and choice of number of
# components
MyResult.pca3 <- pca(X, ncomp = 10)
plot(MyResult.pca3)
# We can also have a look at the variable coefficients in each
# component with the loading vectors.
# a minimal example
plotLoadings(MyResult.pca)# a customized example to only show the top 100 genes
# and their gene name
plotLoadings(MyResult.pca, ndisplay = 100,
name.var = liver.toxicity$gene.ID[, "geneBank"],
size.name = rel(0.3))
plotIndiv(MyResult.pca2,
group = liver.toxicity$treatment$Dose.Group,
style="3d",legend = TRUE,
title = 'Liver toxicity: genes, PCA comp 1 - 2 - 3')PLS
PLS is a family of multivariate statistical techniques based on dimension reduction
#######################################
####################### PLS ###########
#######################################
# We first set up the data as X expression matrix and Y as the lipid abundance matrix. We also
# check that the dimensions are correct and match:
data(nutrimouse)
X <- nutrimouse$gene
Y <- nutrimouse$lipid
dim(X); dim(Y)[1] 40 120
[1] 40 21
plotIndiv(MyResult.pls, group = nutrimouse$genotype,
rep.space = "XY-variate", legend = TRUE,
legend.title = 'Genotype',
ind.names = nutrimouse$diet,
title = 'Nutrimouse: PLS')# A cut-off can be set to display only the variables that mostly contribute to the definition of each component.
plotVar(MyResult.pls, cutoff=0.5)my.vip <- sort(vip(MyResult.pls)[,1],decreasing = TRUE)
barplot(my.vip[1:50],
beside = FALSE,
ylim = c(0, max(my.vip)), legend = rownames(my.vip)[1:50],
main = "Variable Importance in the Projection", font.main = 4)# The loading plots help visualise the coefficients assigned to each selected variable on each component:
plotLoadings(MyResult.pls, comp = 1, size.name = rel(0.5))# We run a PLS model with a sufficient number of components first, then run perf on the
# object.
MyResult.pls <- pls(X,Y, ncomp = 6)
set.seed(30) # for reproducbility
perf.pls <- perf(MyResult.pls, validation = "Mfold", folds = 5,
progressBar = FALSE, nrepeat = 10)
plot(perf.pls$Q2.total)
abline(h = 0.0975)